This quick guide collects important course-specific tips and notes for your Lab Sandbox. You can refer back to it at any time throughout the course, or visit the Learner Help Center for more information.
Throughout this course, you'll encounter datasets that are hosted on other websites or linked from the course instructional materials. If you'd like to complete your work in the Lab Sandbox environment, download these datasets from their listed websites and upload the data files directly into your lab environment. Lab Sandboxes have limited access to external sites, so uploading your data files directly will help ensure you do not encounter any access errors.
Happy Learning!
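The notebooks below assume that the dataset archives referenced in the code (for example, people_wiki.sframe.zip and images.sf.zip) have already been uploaded to the notebook's working directory, as described in the note above.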
! pip install turicreate==6.4.1
!unzip people_wiki.sframe.zip
from __future__ import print_function # to conform python 2.x print to python 3.x
import turicreate
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
wiki = turicreate.SFrame('people_wiki.sframe')
wiki
wiki['word_count'] = turicreate.text_analytics.count_words(wiki['text'])
model = turicreate.nearest_neighbors.create(wiki, label='name', features=['word_count'],
method='brute_force', distance='euclidean')
model.query(wiki[wiki['name']=='Barack Obama'], label='name', k=10)
def top_words(name):
"""
Get a table of the most frequent words in the given person's wikipedia page.
"""
row = wiki[wiki['name'] == name]
word_count_table = row[['word_count']].stack('word_count', new_column_name=['word','count'])
return word_count_table.sort('count', ascending=False)
obama_words = top_words('Barack Obama')
obama_words
barrio_words = top_words('Francisco Barrio')
barrio_words
combined_words = obama_words.join(barrio_words, on='word')
combined_words
combined_words = combined_words.rename({'count':'Obama', 'count.1':'Barrio'})
combined_words
combined_words.sort('Obama', ascending=False)[:5]
common_words = set(combined_words.sort('Obama', ascending=False)[:5]['word']) # YOUR CODE HERE
def has_top_words(word_count_vector):
# extract the keys of word_count_vector and convert it to a set
unique_words = set(word_count_vector.keys()) # YOUR CODE HERE
# return True if common_words is a subset of unique_words
# return False otherwise
return common_words.issubset(unique_words) # YOUR CODE HERE
wiki['has_top_words'] = wiki['word_count'].apply(has_top_words)
# Use the has_top_words column to answer the quiz question:
# how many of the pages contain all 5 words in common_words?
wiki['has_top_words'].sum()
len(wiki['has_top_words'])
print('Output from your function:', has_top_words(wiki[32]['word_count']))
print('Correct output: True')
print('Also check the length of unique_words. It should be 167')
print(len(wiki[32]['word_count']))
print('Output from your function:', has_top_words(wiki[33]['word_count']))
print('Correct output: False')
print('Also check the length of unique_words. It should be 188')
print(len(wiki[33]['word_count']))
o = wiki[wiki['name'] == 'Barack Obama']['word_count'][0]
b = wiki[wiki['name'] == 'George W. Bush']['word_count'][0]
j = wiki[wiki['name'] == 'Joe Biden']['word_count'][0]
turicreate.toolkits.distances.euclidean(o, b)
turicreate.toolkits.distances.euclidean(o, j)
turicreate.toolkits.distances.euclidean(j, b)
bush_words = top_words('George W. Bush')
combined_words = obama_words.join(bush_words, on='word')
combined_words = combined_words.rename({'count':'Obama', 'count.1':'Bush'})
combined_words.sort('Obama', ascending=False)
wiki['tf_idf'] = turicreate.text_analytics.tf_idf(wiki['word_count'])
model_tf_idf = turicreate.nearest_neighbors.create(wiki, label='name', features=['tf_idf'],
method='brute_force', distance='euclidean')
model_tf_idf.query(wiki[wiki['name'] == 'Barack Obama'], label='name', k=10)
def top_words_tf_idf(name):
row = wiki[wiki['name'] == name]
word_count_table = row[['tf_idf']].stack('tf_idf', new_column_name=['word','weight'])
return word_count_table.sort('weight', ascending=False)
obama_tf_idf = top_words_tf_idf('Barack Obama')
obama_tf_idf
schiliro_tf_idf = top_words_tf_idf('Phil Schiliro')
schiliro_tf_idf
combined_words = obama_tf_idf.join(schiliro_tf_idf, on='word')
combined_words = combined_words.rename({'weight':'Obama', 'weight.1':'Schiliro'})
combined_words.sort('Obama', ascending=False)
common_words = set(combined_words.sort('Obama', ascending=False)[:5]['word']) # YOUR CODE HERE
def has_top_words(word_count_vector):
# extract the keys of word_count_vector and convert it to a set
unique_words = set(word_count_vector.keys()) # YOUR CODE HERE
# return True if common_words is a subset of unique_words
# return False otherwise
return common_words.issubset(unique_words) # YOUR CODE HERE
wiki['has_top_words'] = wiki['word_count'].apply(has_top_words)
# Use the has_top_words column to answer the quiz question:
# how many of the pages contain all 5 words in common_words?
wiki['has_top_words'].sum()
common_words
biden_tf_idf = top_words_tf_idf('Joe Biden')
biden_tf_idf
combined_words = obama_tf_idf.join(biden_tf_idf, on='word')
combined_words['weight'], combined_words['weight.1']
combined_words
turicreate.toolkits.distances.euclidean(list(combined_words['weight']), list(combined_words['weight.1']))
turicreate.toolkits.distances.euclidean(wiki[wiki['name'] == 'Barack Obama']['tf_idf'][0],
wiki[wiki['name'] == 'Joe Biden']['tf_idf'][0])
model_tf_idf.query(wiki[wiki['name'] == 'Barack Obama'], label='name', k=10)
def compute_length(row):
return len(row['text'].split(' '))
wiki['length'] = wiki.apply(compute_length)
nearest_neighbors_euclidean = model_tf_idf.query(wiki[wiki['name'] == 'Barack Obama'], label='name', k=100)
nearest_neighbors_euclidean = nearest_neighbors_euclidean.join(wiki[['name', 'length']], on={'reference_label':'name'})
nearest_neighbors_euclidean.sort('rank')
plt.figure(figsize=(10.5,4.5))
plt.hist(wiki['length'], 50, color='k', edgecolor='None', histtype='stepfilled', density=True,
         label='Entire Wikipedia', zorder=3, alpha=0.8)
plt.hist(nearest_neighbors_euclidean['length'], 50, color='r', edgecolor='None', histtype='stepfilled', density=True,
         label='100 NNs of Obama (Euclidean)', zorder=10, alpha=0.8)
plt.axvline(x=wiki['length'][wiki['name'] == 'Barack Obama'][0], color='k', linestyle='--', linewidth=4,
label='Length of Barack Obama', zorder=2)
plt.axvline(x=wiki['length'][wiki['name'] == 'Joe Biden'][0], color='g', linestyle='--', linewidth=4,
label='Length of Joe Biden', zorder=1)
plt.axis([0, 1000, 0, 0.04])
plt.legend(loc='best', prop={'size':15})
plt.title('Distribution of document length')
plt.xlabel('# of words')
plt.ylabel('Percentage')
plt.rcParams.update({'font.size':16})
plt.tight_layout()
model2_tf_idf = turicreate.nearest_neighbors.create(wiki, label='name', features=['tf_idf'],
method='brute_force', distance='cosine')
nearest_neighbors_cosine = model2_tf_idf.query(wiki[wiki['name'] == 'Barack Obama'], label='name', k=100)
nearest_neighbors_cosine = nearest_neighbors_cosine.join(wiki[['name', 'length']], on={'reference_label':'name'})
nearest_neighbors_cosine.sort('rank')
plt.figure(figsize=(10.5,4.5))
plt.hist(wiki['length'], 50, color='k', edgecolor='None', histtype='stepfilled', density=True,
         label='Entire Wikipedia', zorder=3, alpha=0.8)
plt.hist(nearest_neighbors_euclidean['length'], 50, color='r', edgecolor='None', histtype='stepfilled', density=True,
         label='100 NNs of Obama (Euclidean)', zorder=10, alpha=0.8)
plt.hist(nearest_neighbors_cosine['length'], 50, color='b', edgecolor='None', histtype='stepfilled', density=True,
         label='100 NNs of Obama (cosine)', zorder=11, alpha=0.8)
plt.axvline(x=wiki['length'][wiki['name'] == 'Barack Obama'][0], color='k', linestyle='--', linewidth=4,
label='Length of Barack Obama', zorder=2)
plt.axvline(x=wiki['length'][wiki['name'] == 'Joe Biden'][0], color='g', linestyle='--', linewidth=4,
label='Length of Joe Biden', zorder=1)
plt.axis([0, 1000, 0, 0.04])
plt.legend(loc='best', prop={'size':15})
plt.title('Distribution of document length')
plt.xlabel('# of words')
plt.ylabel('Percentage')
plt.rcParams.update({'font.size': 16})
plt.tight_layout()
sf = turicreate.SFrame({'text': ['democratic governments control law in response to popular act']})
sf['word_count'] = turicreate.text_analytics.count_words(sf['text'])
encoder = turicreate.toolkits._feature_engineering.TFIDF(features=['word_count'], output_column_prefix='tf_idf')
encoder.fit(wiki)
sf = encoder.transform(sf)
sf
tweet_tf_idf = sf[0]['tf_idf.word_count']
tweet_tf_idf
obama = wiki[wiki['name'] == 'Barack Obama']
obama
obama_tf_idf = obama[0]['tf_idf']
turicreate.toolkits.distances.cosine(obama_tf_idf, tweet_tf_idf)
model2_tf_idf.query(obama, label='name', k=10)
from __future__ import print_function # to conform python 2.x print to python 3.x
import numpy as np
import turicreate
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import pairwise_distances
import time
from copy import copy
import matplotlib.pyplot as plt
%matplotlib inline
def norm(x):
    '''Compute the norm of a sparse vector. (Thanks to: Jaiyam Sharma)'''
    sum_sq = x.dot(x.T)
    return np.sqrt(sum_sq)
wiki = turicreate.SFrame('people_wiki.sframe/')
wiki = wiki.add_row_number()
wiki['tf_idf'] = turicreate.text_analytics.tf_idf(wiki['text'])
wiki.head()
def sframe_to_scipy(x, column_name):
'''
Convert a dictionary column of an SFrame into a sparse matrix format where
each (row_id, column_id, value) triple corresponds to the value of
x[row_id][column_id], where column_id is a key in the dictionary.
Example
>>> sparse_matrix, map_key_to_index = sframe_to_scipy(sframe, column_name)
'''
assert type(x[column_name][0]) == dict, \
'The chosen column must be dict type, representing sparse data.'
# Stack will transform x to have a row for each unique (row, key) pair.
x = x.stack(column_name, ['feature', 'value'])
# Map feature words to integers
unique_words = sorted(x['feature'].unique())
mapping = {word:i for i, word in enumerate(unique_words)}
x['feature_id'] = x['feature'].apply(lambda x: mapping[x])
# Create numpy arrays that contain the data for the sparse matrix.
row_id = np.array(x['id'])
col_id = np.array(x['feature_id'])
data = np.array(x['value'])
width = x['id'].max() + 1
height = x['feature_id'].max() + 1
# Create a sparse matrix.
mat = csr_matrix((data, (row_id, col_id)), shape=(width, height))
return mat, mapping
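# A quick illustrative check of sframe_to_scipy on toy data (not part of the assignment):
# two "documents" with word-count dictionaries should become a 2 x 3 sparse matrix whose
# columns follow the returned word-to-index mapping.
toy = turicreate.SFrame({'id': [0, 1],
                         'word_count': [{'a': 1.0, 'b': 2.0}, {'b': 3.0, 'c': 4.0}]})
toy_mat, toy_mapping = sframe_to_scipy(toy, 'word_count')
print(toy_mat.toarray())  # expected: [[1. 2. 0.] [0. 3. 4.]]
print(toy_mapping)        # expected: {'a': 0, 'b': 1, 'c': 2}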
%%time
corpus, mapping = sframe_to_scipy(wiki, 'tf_idf')
assert corpus.shape == (59071, 547979)
print('Check passed correctly!')
def generate_random_vectors(dim, n_vectors):
return np.random.randn(dim, n_vectors)
# Generate 16 random vectors of dimension 547979
np.random.seed(0)
n_vectors = 16
random_vectors = generate_random_vectors(corpus.shape[1], n_vectors)
random_vectors.shape
sample = corpus[0] # vector of tf-idf values for document 0
bin_indices_bits = sample.dot(random_vectors[:,0]) >= 0
bin_indices_bits
sample.dot(random_vectors[:, 1]) >= 0 # True if positive sign; False if negative sign
sample.dot(random_vectors) >= 0 # should return an array of 16 True/False bits
np.array(sample.dot(random_vectors) >= 0, dtype=int) # display index bits in 0/1's
corpus[0:2].dot(random_vectors) >= 0 # compute bit indices of first two documents
corpus.dot(random_vectors) >= 0 # compute bit indices of ALL documents
index_bits = (sample.dot(random_vectors) >= 0)
powers_of_two = (1 << np.arange(15, -1, -1))
print(index_bits)
print(powers_of_two)
print(index_bits.dot(powers_of_two))
index_bits = sample.dot(random_vectors) >= 0
index_bits.dot(powers_of_two)
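# Tiny worked example of the bits-to-integer conversion (illustrative only):
# the 4-bit vector [True, False, True, True] is binary 1011, i.e. 8 + 0 + 2 + 1 = 11.
example_bits = np.array([True, False, True, True])
example_powers = 1 << np.arange(3, -1, -1)   # [8, 4, 2, 1]
print(example_bits.dot(example_powers))      # expected: 11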
from collections import defaultdict
def train_lsh(data, n_vectors, seed=None):
if seed is not None:
np.random.seed(seed)
dim = data.shape[1]
random_vectors = generate_random_vectors(dim, n_vectors)
# Partition data points into bins,
# and encode bin index bits into integers
bin_indices_bits = data.dot(random_vectors) >= 0
powers_of_two = 1 << np.arange(n_vectors - 1, -1, step=-1)
bin_indices = bin_indices_bits.dot(powers_of_two)
# Update `table` so that `table[i]` is the list of document ids with bin index equal to i
table = defaultdict(list)
for idx, bin_index in enumerate(bin_indices):
# Fetch the list of document ids associated with the bin and add the document id to the end.
# data_index: document ids
# append() will add a list of document ids to table dict() with key as bin_index
table[bin_index].append(idx) # YOUR CODE HERE
# Note that we're storing the bin_indices here
# so we can do some ad-hoc checking with it,
# this isn't actually required
model = {'data': data,
'table': table,
'random_vectors': random_vectors,
'bin_indices': bin_indices,
'bin_indices_bits': bin_indices_bits}
return model
def compare_bits(model, id_1, id_2):
bits1 = model['bin_indices_bits'][id_1]
bits2 = model['bin_indices_bits'][id_2]
print('Number of matching bits:', np.sum(bits1 == bits2))
return np.sum(bits1 == bits2)
model = train_lsh(corpus, 16, seed=475)
obama_id = wiki[wiki['name'] == 'Barack Obama']['id'][0]
biden_id = wiki[wiki['name'] == 'Joe Biden']['id'][0]
similarity = compare_bits(model, obama_id, biden_id)
# This function will help us get similar items, given the id
def get_similarity_items(X_tfidf, item_id, topn=5):
"""
Get the top similar items for a given item id.
The similarity measure here is based on cosine distance.
"""
query = X_tfidf[item_id]
scores = X_tfidf.dot(query.T).toarray().ravel()
best = np.argpartition(scores, -topn)[-topn:]
similar_items = sorted(zip(best, scores[best]), key=lambda x: -x[1])
similar_item_ids = [similar_item for similar_item, _ in similar_items]
print("Similar items to id: {}".format(item_id))
for _id in similar_item_ids:
print(wiki[_id]['name'])
print('\n')
return similar_item_ids
wiki[wiki['name'] == 'Barack Obama']
obama_id
s = ''.join(map(str,model['bin_indices_bits'][obama_id].astype(int)))
sum(int(c) * (2 ** i) for i, c in enumerate(s[::-1]))
#s = '1110'
#sum(int(c) * (2 ** i) for i, c in enumerate(s[::-1]))
wiki[wiki['name'] == 'Joe Biden']
so = ''.join(map(str,model['bin_indices_bits'][obama_id].astype(int)))
sb = ''.join(map(str,model['bin_indices_bits'][biden_id].astype(int)))
sum([so[i]==sb[i] for i in range(len(so))])
jones_id = wiki[wiki['name']=='Wynn Normington Hugh-Jones']['id'][0]
compare_bits(model, obama_id, jones_id)
model['bin_indices'][obama_id]
model['table'][model['bin_indices'][obama_id]]
doc_ids = list(model['table'][model['bin_indices'][35817]])
doc_ids.remove(35817) # display documents other than Obama
docs = wiki.filter_by(values=doc_ids, column_name='id') # filter by id column
docs
res = compare_bits(model, obama_id, docs[0]['id']), compare_bits(model, obama_id, biden_id)
from itertools import combinations
num_vector = 16
search_radius = 3
for diff in combinations(range(num_vector), search_radius):
print(diff)
def search_nearby_bins(query_bin_bits, table, search_radius=2, initial_candidates=set()):
"""
For a given query vector and trained LSH model, return all candidate neighbors for
the query among all bins within the given search radius.
Example usage
-------------
    >>> model = train_lsh(corpus, n_vectors=16, seed=143)
    >>> q = model['bin_indices_bits'][0] # bit vector for the first document
>>> candidates = search_nearby_bins(q, model['table'])
"""
num_vector = len(query_bin_bits)
powers_of_two = 1 << np.arange(num_vector-1, -1, -1)
# Allow the user to provide an initial set of candidates.
candidate_set = copy(initial_candidates)
for different_bits in combinations(range(num_vector), search_radius):
# Flip the bits (n_1,n_2,...,n_r) of the query bin to produce a new bit vector.
## Hint: you can iterate over a tuple like a list
alternate_bits = copy(query_bin_bits)
for i in different_bits:
alternate_bits[i] = 1 - alternate_bits[i] # YOUR CODE HERE
# Convert the new bit vector to an integer index
nearby_bin = alternate_bits.dot(powers_of_two)
# Fetch the list of documents belonging to the bin indexed by the new bit vector.
# Then add those documents to candidate_set
# Make sure that the bin exists in the table!
# Hint: update() method for sets lets you add an entire list to the set
if nearby_bin in table:
more_docs = table[nearby_bin] # Get all document_ids of the bin
            # YOUR CODE HERE: Update candidate_set with the documents in this bin.
            candidate_set.update(more_docs)
return candidate_set
obama_bin_index = model['bin_indices_bits'][35817] # bin index of Barack Obama
candidate_set = search_nearby_bins(obama_bin_index, model['table'], search_radius=0)
if candidate_set == set({35817, 54743}):
print('Passed test')
else:
print('Check your code')
print('List of documents in the same bin as Obama: {}'.format(candidate_set))
candidate_set = search_nearby_bins(obama_bin_index, model['table'], search_radius=1, initial_candidates=candidate_set)
if candidate_set == set({42243, 28804, 1810, 48919, 24478, 31010, 7331, 23716, 51108, 48040, 36266, 33200, 25023, 23617, 54743, 34910, 35817, 34159, 14451, 23926, 39032, 12028, 43775}):
print('Passed test')
else:
print('Check your code')
print(candidate_set)
def query(vec, model, k, max_search_radius):
data = model['data']
table = model['table']
random_vectors = model['random_vectors']
num_vector = random_vectors.shape[1]
# Compute bin index for the query vector, in bit representation.
bin_index_bits = (vec.dot(random_vectors) >= 0).flatten()
# Search nearby bins and collect candidates
candidate_set = set()
for search_radius in range(max_search_radius+1):
candidate_set = search_nearby_bins(bin_index_bits, table, search_radius, initial_candidates=candidate_set)
# Sort candidates by their true distances from the query
nearest_neighbors = turicreate.SFrame({'id':candidate_set})
candidates = data[np.array(list(candidate_set)),:]
nearest_neighbors['distance'] = pairwise_distances(candidates, vec, metric='cosine').flatten()
return nearest_neighbors.topk('distance', k, reverse=True), len(candidate_set)
query(corpus[35817,:], model, k=10, max_search_radius=3)
query(corpus[35817,:], model, k=10, max_search_radius=3)[0].join(wiki[['id', 'name']], on='id').sort('distance')
wiki[wiki['name']=='Barack Obama']
%%time
num_candidates_history = []
query_time_history = []
max_distance_from_query_history = []
min_distance_from_query_history = []
average_distance_from_query_history = []
for max_search_radius in range(17):
start=time.time()
result, num_candidates = query(corpus[35817,:], model, k=10,
max_search_radius=max_search_radius)
end=time.time()
query_time = end-start
print('Radius:', max_search_radius)
print(result.join(wiki[['id', 'name']], on='id').sort('distance'))
#res = list(result.join(wiki[['id', 'name']], on='id')['distance'])
#print(res)
#print(np.mean(res))
average_distance_from_query = result['distance'][1:].mean()
max_distance_from_query = result['distance'][1:].max()
min_distance_from_query = result['distance'][1:].min()
print(average_distance_from_query)
num_candidates_history.append(num_candidates)
query_time_history.append(query_time)
average_distance_from_query_history.append(average_distance_from_query)
max_distance_from_query_history.append(max_distance_from_query)
min_distance_from_query_history.append(min_distance_from_query)
plt.figure(figsize=(7,4.5))
plt.plot(num_candidates_history, linewidth=4)
plt.xlabel('Search radius')
plt.ylabel('# of documents searched')
plt.rcParams.update({'font.size':16})
plt.tight_layout()
plt.figure(figsize=(7,4.5))
plt.plot(query_time_history, linewidth=4)
plt.xlabel('Search radius')
plt.ylabel('Query time (seconds)')
plt.rcParams.update({'font.size':16})
plt.tight_layout()
plt.figure(figsize=(7,4.5))
plt.plot(average_distance_from_query_history, linewidth=4, label='Average of 10 neighbors')
plt.plot(max_distance_from_query_history, linewidth=4, label='Farthest of 10 neighbors')
plt.plot(min_distance_from_query_history, linewidth=4, label='Closest of 10 neighbors')
plt.xlabel('Search radius')
plt.ylabel('Cosine distance of neighbors')
plt.legend(loc='best', prop={'size':15})
plt.rcParams.update({'font.size':16})
plt.tight_layout()
from __future__ import print_function # to conform python 2.x print to python 3.x
import turicreate
import matplotlib.pyplot as plt
import numpy as np
import sys
import os
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
%matplotlib inline
wiki = turicreate.SFrame('people_wiki.sframe/')
wiki['tf_idf'] = turicreate.text_analytics.tf_idf(wiki['text'])
def sframe_to_scipy(x, column_name):
'''
Convert a dictionary column of an SFrame into a sparse matrix format where
each (row_id, column_id, value) triple corresponds to the value of
x[row_id][column_id], where column_id is a key in the dictionary.
Example
>>> sparse_matrix, map_key_to_index = sframe_to_scipy(sframe, column_name)
'''
assert type(x[column_name][0]) == dict, \
'The chosen column must be dict type, representing sparse data.'
# 1. Add a row number (id)
x = x.add_row_number()
# 2. Stack will transform x to have a row for each unique (row, key) pair.
x = x.stack(column_name, ['feature', 'value'])
# Map feature words to integers
unique_words = sorted(x['feature'].unique())
mapping = {word:i for i, word in enumerate(unique_words)}
x['feature_id'] = x['feature'].apply(lambda x: mapping[x])
# Create numpy arrays that contain the data for the sparse matrix.
row_id = np.array(x['id'])
col_id = np.array(x['feature_id'])
data = np.array(x['value'])
width = x['id'].max() + 1
height = x['feature_id'].max() + 1
# Create a sparse matrix.
mat = csr_matrix((data, (row_id, col_id)), shape=(width, height))
return mat, mapping
%%time
# The conversion will take about a minute or two.
tf_idf, map_index_to_word = sframe_to_scipy(wiki, 'tf_idf')  # note: despite the name, this dict maps word -> column index (as expected by visualize_document_clusters below)
tf_idf.shape
from sklearn.preprocessing import normalize
tf_idf = normalize(tf_idf)
def get_initial_centroids(data, k, seed=None):
'''Randomly choose k data points as initial centroids'''
if seed is not None: # useful for obtaining consistent results
np.random.seed(seed)
n = data.shape[0] # number of data points
# Pick K indices from range [0, N).
rand_indices = np.random.randint(0, n, k)
# Keep centroids as dense format, as many entries will be nonzero due to averaging.
# As long as at least one document in a cluster contains a word,
# it will carry a nonzero weight in the TF-IDF vector of the centroid.
centroids = data[rand_indices,:].toarray()
return centroids
from sklearn.metrics import pairwise_distances
# Get the TF-IDF vectors for documents 100 and 101 (two query documents).
queries = tf_idf[100:102,:]
# Compute pairwise distances from every data point to each query vector.
dist = pairwise_distances(tf_idf, queries, metric='euclidean')
print(dist)
k = 3
centroids = tf_idf[:k,:]
distances = pairwise_distances(tf_idf, centroids, metric='euclidean')
print(distances)
dist = pairwise_distances(tf_idf[430,:], centroids[1], metric='euclidean')
print(dist)
'''Test cell'''
if np.allclose(dist, pairwise_distances(tf_idf[430,:], tf_idf[1,:])):
print('Pass')
else:
print('Check your code again')
closest_cluster = np.argmin(distances, 1)
closest_cluster
'''Test cell'''
reference = [list(row).index(min(row)) for row in distances]
if np.allclose(closest_cluster, reference):
print('Pass')
else:
print('Check your code again')
def get_cluster_assignments(data, centroids):
distances = pairwise_distances(data, centroids, metric='euclidean')
return np.argmin(distances, 1)
cluster_assignment = get_cluster_assignments(tf_idf, centroids)
if len(cluster_assignment)==59071 and \
np.array_equal(np.bincount(cluster_assignment), np.array([23061, 10086, 25924])):
print('Pass') # count number of data points for each cluster
else:
print('Check your code again.')
def assign_clusters(data, centroids):
# Compute distances between each data point and the set of centroids:
# Fill in the blank (RHS only)
#distances_from_centroids = ... # YOUR CODE HERE
# Compute cluster assignments for each data point:
# Fill in the blank (RHS only)
cluster_assignment = get_cluster_assignments(data, centroids) # YOUR CODE HERE
return cluster_assignment
if np.allclose(assign_clusters(tf_idf[0:100:10], tf_idf[0:8:2]), np.array([0, 1, 1, 0, 0, 2, 0, 2, 2, 1])):
print('Pass')
else:
print('Check your code again.')
data = np.array([[1., 2., 0.],
[0., 0., 0.],
[2., 2., 0.]])
centroids = np.array([[0.5, 0.5, 0.],
[0., -0.5, 0.]])
cluster_assignment = assign_clusters(data, centroids)
print(cluster_assignment)
cluster_assignment==1
cluster_assignment==0
data[cluster_assignment==1]
data[cluster_assignment==0]
data[cluster_assignment==0].mean(axis=0)
def revise_centroids(data, k, cluster_assignment):
new_centroids = []
for i in range(k):
# Select all data points that belong to cluster i. Fill in the blank (RHS only)
member_data_points = data[cluster_assignment == i] # YOUR CODE HERE
# Compute the mean of the data points. Fill in the blank (RHS only)
centroid = np.mean(member_data_points, axis=0) # YOUR CODE HERE
# Convert numpy.matrix type to numpy.ndarray type
centroid = centroid.A1
new_centroids.append(centroid)
new_centroids = np.array(new_centroids)
return new_centroids
result = revise_centroids(tf_idf[0:100:10], 3, np.array([0, 1, 1, 0, 0, 2, 0, 2, 2, 1]))
if np.allclose(result[0], np.mean(tf_idf[[0,30,40,60]].toarray(), axis=0)) and \
np.allclose(result[1], np.mean(tf_idf[[10,20,90]].toarray(), axis=0)) and \
np.allclose(result[2], np.mean(tf_idf[[50,70,80]].toarray(), axis=0)):
print('Pass')
else:
print('Check your code')
def compute_heterogeneity(data, k, centroids, cluster_assignment):
heterogeneity = 0.0
for i in range(k):
# Select all data points that belong to cluster i. Fill in the blank (RHS only)
member_data_points = data[cluster_assignment==i, :]
if member_data_points.shape[0] > 0: # check if i-th cluster is non-empty
# Compute distances from centroid to data points (RHS only)
distances = pairwise_distances(member_data_points, [centroids[i]], metric='euclidean')
squared_distances = distances**2
heterogeneity += np.sum(squared_distances)
return heterogeneity
compute_heterogeneity(data, 2, centroids, cluster_assignment)
# Fill in the blanks
def kmeans(data, k, initial_centroids, maxiter, record_heterogeneity=None, verbose=False):
'''This function runs k-means on given data and initial set of centroids.
maxiter: maximum number of iterations to run.
record_heterogeneity: (optional) a list, to store the history of heterogeneity as function of iterations
if None, do not store the history.
verbose: if True, print how many data points changed their cluster labels in each iteration'''
centroids = initial_centroids[:]
prev_cluster_assignment = None
for itr in range(maxiter):
if verbose:
print(itr)
# 1. Make cluster assignments using nearest centroids
# YOUR CODE HERE
cluster_assignment = assign_clusters(data, centroids)
# 2. Compute a new centroid for each of the k clusters, averaging all data points assigned to that cluster.
# YOUR CODE HERE
centroids = revise_centroids(data, k, cluster_assignment)
# Check for convergence: if none of the assignments changed, stop
if prev_cluster_assignment is not None and \
(prev_cluster_assignment==cluster_assignment).all():
break
# Print number of new assignments
if prev_cluster_assignment is not None:
num_changed = np.sum(prev_cluster_assignment!=cluster_assignment)
if verbose:
print(' {0:5d} elements changed their cluster assignment.'.format(num_changed))
# Record heterogeneity convergence metric
if record_heterogeneity is not None:
# YOUR CODE HERE
score = compute_heterogeneity(data, k, centroids, cluster_assignment)
record_heterogeneity.append(score)
prev_cluster_assignment = cluster_assignment[:]
return centroids, cluster_assignment
def plot_heterogeneity(heterogeneity, k):
plt.figure(figsize=(7,4))
plt.plot(heterogeneity, linewidth=4)
plt.xlabel('# Iterations')
plt.ylabel('Heterogeneity')
plt.title('Heterogeneity of clustering over time, K={0:d}'.format(k))
plt.rcParams.update({'font.size': 16})
plt.tight_layout()
k = 3
heterogeneity = []
initial_centroids = get_initial_centroids(tf_idf, k, seed=0)
centroids, cluster_assignment = kmeans(tf_idf, k, initial_centroids, maxiter=400,
record_heterogeneity=heterogeneity, verbose=True)
plot_heterogeneity(heterogeneity, k)
np.bincount(cluster_assignment)
k = 10
heterogeneity = {}
cluster_assignment_dict = {}
import time
start = time.time()
for seed in [0, 20000, 40000, 60000, 80000, 100000, 120000]:
initial_centroids = get_initial_centroids(tf_idf, k, seed)
centroids, cluster_assignment = kmeans(tf_idf, k, initial_centroids, maxiter=400,
record_heterogeneity=None, verbose=False)
# To save time, compute heterogeneity only once in the end
heterogeneity[seed] = compute_heterogeneity(tf_idf, k, centroids, cluster_assignment)
# This is the line we added for the next quiz question
cluster_assignment_dict[seed] = np.bincount(cluster_assignment)
# print('seed={0:06d}, heterogeneity={1:.5f}'.format(seed, heterogeneity[seed]))
# And this is the modified print statement
print('seed={0:06d}, heterogeneity={1:.5f}, cluster_distribution={2}'.format(seed, heterogeneity[seed],
cluster_assignment_dict[seed]))
sys.stdout.flush()
end = time.time()
print(end-start)
def smart_initialize(data, k, seed=None):
'''Use k-means++ to initialize a good set of centroids'''
if seed is not None: # useful for obtaining consistent results
np.random.seed(seed)
centroids = np.zeros((k, data.shape[1]))
# Randomly choose the first centroid.
# Since we have no prior knowledge, choose uniformly at random
idx = np.random.randint(data.shape[0])
centroids[0] = data[idx,:].toarray()
# Compute distances from the first centroid chosen to all the other data points
squared_distances = pairwise_distances(data, centroids[0:1], metric='euclidean').flatten()**2
for i in range(1, k):
# Choose the next centroid randomly, so that the probability for each data point to be chosen
# is directly proportional to its squared distance from the nearest centroid.
    # Roughly speaking, a new centroid should be as far from the other centroids as possible.
idx = np.random.choice(data.shape[0], 1, p=squared_distances/sum(squared_distances))
centroids[i] = data[idx,:].toarray()
# Now compute distances from the centroids to all data points
squared_distances = np.min(pairwise_distances(data, centroids[0:i+1], metric='euclidean')**2,axis=1)
return centroids
%%time
k = 10
heterogeneity_smart = {}
seeds = [0, 20000, 40000, 60000, 80000, 100000, 120000]
for seed in seeds:
initial_centroids = smart_initialize(tf_idf, k, seed)
centroids, cluster_assignment = kmeans(tf_idf, k, initial_centroids, maxiter=400,
record_heterogeneity=None, verbose=False)
# To save time, compute heterogeneity only once in the end
heterogeneity_smart[seed] = compute_heterogeneity(tf_idf, k, centroids, cluster_assignment)
print('seed={0:06d}, heterogeneity={1:.5f}'.format(seed, heterogeneity_smart[seed]))
sys.stdout.flush()
plt.figure(figsize=(8,5))
plt.boxplot([list(heterogeneity.values()), list(heterogeneity_smart.values())], vert=False)
plt.yticks([1, 2], ['k-means', 'k-means++'])
plt.rcParams.update({'font.size': 16})
plt.tight_layout()
def kmeans_multiple_runs(data, k, maxiter, num_runs, seed_list=None, verbose=False):
heterogeneity = {}
min_heterogeneity_achieved = float('inf')
best_seed = None
final_centroids = None
final_cluster_assignment = None
for i in range(num_runs):
# Use UTC time if no seeds are provided
if seed_list is not None:
seed = seed_list[i]
np.random.seed(seed)
else:
seed = int(time.time())
np.random.seed(seed)
# Use k-means++ initialization
# YOUR CODE HERE
initial_centroids = smart_initialize(data, k, seed)
# Run k-means
# YOUR CODE HERE
        centroids, cluster_assignment = kmeans(data, k, initial_centroids, maxiter=maxiter,
                                                record_heterogeneity=None, verbose=False)
        # To save time, compute heterogeneity only once in the end
        # YOUR CODE HERE
        heterogeneity[seed] = compute_heterogeneity(data, k, centroids, cluster_assignment)
if verbose:
print('seed={0:06d}, heterogeneity={1:.5f}'.format(seed, heterogeneity[seed]))
sys.stdout.flush()
# if current measurement of heterogeneity is lower than previously seen,
# update the minimum record of heterogeneity.
if heterogeneity[seed] < min_heterogeneity_achieved:
min_heterogeneity_achieved = heterogeneity[seed]
best_seed = seed
final_centroids = centroids
final_cluster_assignment = cluster_assignment
# Return the centroids and cluster assignments that minimize heterogeneity.
return final_centroids, final_cluster_assignment
%%time
import numpy as np
def plot_k_vs_heterogeneity(k_values, heterogeneity_values):
plt.figure(figsize=(7,4))
plt.plot(k_values, heterogeneity_values, linewidth=4)
plt.xlabel('K')
plt.ylabel('Heterogeneity')
plt.title('K vs. Heterogeneity')
plt.rcParams.update({'font.size': 16})
plt.tight_layout()
centroids = {}
cluster_assignment = {}
heterogeneity_values = []
k_list = [2, 10, 25, 50, 100]
# seed_list = [0]  # use a single seed for a quicker run
# Running with the full seed list below may take about an hour to finish.
seed_list = [0, 20000, 40000, 60000, 80000, 100000, 120000]
for k in k_list:
heterogeneity = []
centroids[k], cluster_assignment[k] = kmeans_multiple_runs(tf_idf, k, maxiter=400,
num_runs=len(seed_list), seed_list=seed_list,
verbose=True)
score = compute_heterogeneity(tf_idf, k, centroids[k], cluster_assignment[k])
heterogeneity_values.append(score)
plot_k_vs_heterogeneity(k_list, heterogeneity_values)
def visualize_document_clusters(wiki, tf_idf, centroids, cluster_assignment, k, map_word_to_index, display_content=True):
    '''wiki: original SFrame
    tf_idf: data matrix, sparse matrix format
    map_word_to_index: dictionary mapping each word to its column index
    display_content: if True, display 8 nearest neighbors of each centroid'''
map_index_to_word = {v:k for k,v in map_word_to_index.items()}
print('==========================================================')
# Visualize each cluster c
for c in range(k):
# Cluster heading
        print('Cluster {0:d} '.format(c), end='')
# Print top 5 words with largest TF-IDF weights in the cluster
idx = centroids[c].argsort()[::-1]
for i in range(5): # Print each word along with the TF-IDF weight
            print('{0:s}:{1:.3f}'.format(map_index_to_word[idx[i]], centroids[c][idx[i]]), end=' ')
print('')
if display_content:
# Compute distances from the centroid to all data points in the cluster,
# and compute nearest neighbors of the centroids within the cluster.
distances = pairwise_distances(tf_idf, centroids[c].reshape(1, -1), metric='euclidean').flatten()
distances[cluster_assignment!=c] = float('inf') # remove non-members from consideration
nearest_neighbors = distances.argsort()
# For 8 nearest neighbors, print the title as well as first 180 characters of text.
# Wrap the text at 80-character mark.
for i in range(8):
text = ' '.join(wiki[nearest_neighbors[i]]['text'].split(None, 25)[0:25])
print('\n* {0:50s} {1:.5f}\n {2:s}\n {3:s}'.format(wiki[nearest_neighbors[i]]['name'],
distances[nearest_neighbors[i]], text[:90], text[90:180] if len(text) > 90 else ''))
print('==========================================================')
'''Note the bracket indexing for centroids and cluster_assignment:
both are dictionaries keyed by k, so centroids[2] and cluster_assignment[2]
retrieve the results of the k=2 run.'''
visualize_document_clusters(wiki, tf_idf, centroids[2], cluster_assignment[2], 2, map_index_to_word)
k = 10
visualize_document_clusters(wiki, tf_idf, centroids[k], cluster_assignment[k], k, map_index_to_word)
np.bincount(cluster_assignment[10])
visualize_document_clusters(wiki, tf_idf, centroids[25], cluster_assignment[25], 25,
map_index_to_word, display_content=False) # turn off text for brevity
k=100
visualize_document_clusters(wiki, tf_idf, centroids[k], cluster_assignment[k], k,
map_index_to_word, display_content=False)
# turn off text for brevity -- turn it on if you are curious ;)
sum(np.bincount(cluster_assignment[k]) < 236) / k
k
sum(np.bincount(cluster_assignment[k]) < 236)
!unzip images.sf.zip
from __future__ import print_function # to conform python 2.x print to python 3.x
import turicreate
import numpy as np
import matplotlib.pyplot as plt
import copy
from scipy.stats import multivariate_normal
%matplotlib inline
def log_sum_exp(Z):
""" Compute log(\sum_i exp(Z_i)) for some array Z."""
return np.max(Z) + np.log(np.sum(np.exp(Z - np.max(Z))))
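# Quick sanity check (illustrative only): for a small, well-behaved array the stabilized
# computation should match the naive log(sum(exp(Z))).
Z_check = np.array([-1.0, 0.0, 1.0])
print(log_sum_exp(Z_check), np.log(np.sum(np.exp(Z_check))))  # the two numbers should agree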
def loglikelihood(data, weights, means, covs):
""" Compute the loglikelihood of the data for a Gaussian mixture model with the given parameters. """
num_clusters = len(means)
num_dim = len(data[0])
ll = 0
for d in data:
Z = np.zeros(num_clusters)
for k in range(num_clusters):
# Compute (x-mu)^T * Sigma^{-1} * (x-mu)
delta = np.array(d) - means[k]
exponent_term = np.dot(delta.T, np.dot(np.linalg.inv(covs[k]), delta))
# Compute loglikelihood contribution for this data point and this cluster
Z[k] += np.log(weights[k])
Z[k] -= 1/2. * (num_dim * np.log(2*np.pi) + np.log(np.linalg.det(covs[k])) + exponent_term)
# Increment loglikelihood contribution of this data point across all clusters
ll += log_sum_exp(Z)
return ll
def compute_responsibilities(data, weights, means, covariances):
'''E-step: compute responsibilities, given the current parameters'''
num_data = len(data)
num_clusters = len(means)
resp = np.zeros((num_data, num_clusters))
# Update resp matrix so that resp[i,k] is the responsibility of cluster k for data point i.
# Hint: To compute likelihood of seeing data point i given cluster k, use multivariate_normal.pdf.
for i in range(num_data):
for k in range(num_clusters):
# YOUR CODE HERE
            resp[i, k] = weights[k] * multivariate_normal.pdf(data[i], means[k], covariances[k])
# Add up responsibilities over each data point and normalize
row_sums = resp.sum(axis=1)[:, np.newaxis]
resp = resp / row_sums
return resp
images = turicreate.SFrame('images.sf')
import array
images['rgb'] = images.pack_columns(['red', 'green', 'blue'])['X4']
# The result will pop out in a separate window
images.explore()
def log_sum_exp(Z):
""" Compute log(\sum_i exp(Z_i)) for some array Z."""
return np.max(Z) + np.log(np.sum(np.exp(Z - np.max(Z))))
def loglikelihood(data, weights, means, covs):
""" Compute the loglikelihood of the data for a Gaussian mixture model with the given parameters. """
num_clusters = len(means)
num_dim = len(data[0])
ll = 0
for d in data:
Z = np.zeros(num_clusters)
for k in range(num_clusters):
# Compute (x-mu)^T * Sigma^{-1} * (x-mu)
delta = np.array(d) - means[k]
exponent_term = np.dot(delta.T, np.dot(np.linalg.inv(covs[k]), delta))
# Compute loglikelihood contribution for this data point and this cluster
Z[k] += np.log(weights[k])
Z[k] -= 1/2. * (num_dim * np.log(2*np.pi) + np.log(np.linalg.det(covs[k])) + exponent_term)
# Increment loglikelihood contribution of this data point across all clusters
ll += log_sum_exp(Z)
return ll
import numpy as np
from scipy.stats import multivariate_normal
def compute_responsibilities(data, weights, means, covariances):
'''E-step: compute responsibilities, given the current parameters'''
num_data = len(data)
num_clusters = len(means)
resp = np.zeros((num_data, num_clusters))
# Update resp matrix so that resp[i,k] is the responsibility of cluster k for data point i.
# Hint: To compute likelihood of seeing data point i given cluster k, use multivariate_normal.pdf.
for i in range(num_data):
for k in range(num_clusters):
# YOUR CODE HERE
resp[i, k] = weights[k]*multivariate_normal.pdf(data[i], means[k], covariances[k])
# Add up responsibilities over each data point and normalize
row_sums = resp.sum(axis=1)[:, np.newaxis]
resp = resp / row_sums
return resp
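# Sanity check on toy 2-D data (illustrative values only): every row of the
# responsibility matrix should sum to 1.
toy_data = np.array([[1., 2.], [-1., -2.]])
toy_resp = compute_responsibilities(toy_data,
                                    weights=[0.3, 0.7],
                                    means=[np.array([0., 0.]), np.array([1., 1.])],
                                    covariances=[np.eye(2), np.eye(2)])
print(toy_resp)
print(toy_resp.sum(axis=1))  # expected: [1. 1.]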
def compute_soft_counts(resp):
# Compute the total responsibility assigned to each cluster, which will be useful when
# implementing M-steps below. In the lectures this is called N^{soft}
counts = np.sum(resp, axis=0)
return counts
def compute_weights(counts):
    num_clusters = len(counts)
    # Update the weight for each cluster using the M-step update rule for the
    # cluster weight, \hat{\pi}_k.
    # HINT: the total number of data points is the sum of the soft counts.
    # YOUR CODE HERE
    weights = counts / np.sum(counts)
return weights
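# Quick check (illustrative only): weights derived from soft counts should sum to 1.
print(compute_weights(np.array([2., 6., 2.])))          # expected: [0.2 0.6 0.2]
print(np.sum(compute_weights(np.array([2., 6., 2.]))))  # expected: 1.0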
def compute_means(data, resp, counts):
num_clusters = len(counts)
num_data = len(data)
means = [np.zeros(len(data[0]))] * num_clusters
for k in range(num_clusters):
# Update means for cluster k using the M-step update rule for the mean variables.
# This will assign the variable means[k] to be our estimate for \hat{\mu}_k.
weighted_sum = 0.
        for i in range(num_data):
            # YOUR CODE HERE
            weighted_sum += resp[i, k] * data[i]
        # YOUR CODE HERE
        means[k] = weighted_sum / counts[k]
return means
def compute_covariances(data, resp, counts, means):
num_clusters = len(counts)
num_dim = len(data[0])
num_data = len(data)
covariances = [np.zeros((num_dim,num_dim))] * num_clusters
for k in range(num_clusters):
# Update covariances for cluster k using the M-step update rule for covariance variables.
# This will assign the variable covariances[k] to be the estimate for \hat{\Sigma}_k.
weighted_sum = np.zeros((num_dim, num_dim))
        for i in range(num_data):
            # YOUR CODE HERE (Hint: Use np.outer on the data[i] and this cluster's mean)
            weighted_sum += resp[i, k] * np.outer(data[i] - means[k], data[i] - means[k])
        # YOUR CODE HERE
        covariances[k] = weighted_sum / counts[k]
return covariances
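# Quick check with toy values (illustrative only): with a single cluster and all
# responsibilities equal to 1, the M-step estimates reduce to the sample mean and the
# (population) covariance of the data.
one_cluster_data = [np.array([1., 0.]), np.array([3., 0.])]
one_cluster_resp = np.ones((2, 1))
one_cluster_counts = compute_soft_counts(one_cluster_resp)
one_cluster_means = compute_means(one_cluster_data, one_cluster_resp, one_cluster_counts)
print(one_cluster_means)  # expected: [array([2., 0.])]
print(compute_covariances(one_cluster_data, one_cluster_resp, one_cluster_counts, one_cluster_means))
# expected: [array([[1., 0.], [0., 0.]])]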
# SOLUTION
def EM(data, init_means, init_covariances, init_weights, maxiter=1000, thresh=1e-4):
# Make copies of initial parameters, which we will update during each iteration
means = init_means[:]
covariances = init_covariances[:]
weights = init_weights[:]
# Infer dimensions of dataset and the number of clusters
num_data = len(data)
num_dim = len(data[0])
num_clusters = len(means)
# Initialize some useful variables
resp = np.zeros((num_data, num_clusters))
ll = loglikelihood(data, weights, means, covariances)
ll_trace = [ll]
for it in range(maxiter):
if it % 5 == 0:
print("Iteration %s" % it)
# E-step: compute responsibilities
resp = compute_responsibilities(data, weights, means, covariances)
# M-step
# Compute the total responsibility assigned to each cluster, which will be useful when
# implementing M-steps below. In the lectures this is called N^{soft}
counts = compute_soft_counts(resp)
# Update the weight for cluster k using the M-step update rule for the cluster weight, \hat{\pi}_k.
# YOUR CODE HERE
weights = compute_weights(counts)
# Update means for cluster k using the M-step update rule for the mean variables.
# This will assign the variable means[k] to be our estimate for \hat{\mu}_k.
# YOUR CODE HERE
means = compute_means(data, resp, counts)
# Update covariances for cluster k using the M-step update rule for covariance variables.
# This will assign the variable covariances[k] to be the estimate for \hat{\Sigma}_k.
# YOUR CODE HERE
covariances = compute_covariances(data, resp, counts, means)
# Compute the loglikelihood at this iteration
# YOUR CODE HERE
ll_latest = loglikelihood(data, weights, means, covariances)
ll_trace.append(ll_latest)
# Check for convergence in log-likelihood and store
if (ll_latest - ll) < thresh and ll_latest > -np.inf:
break
ll = ll_latest
if it % 5 != 0:
print("Iteration %s" % it)
out = {'weights': weights, 'means': means, 'covs': covariances, 'loglik': ll_trace, 'resp': resp}
return out
np.random.seed(1)
# Initialize parameters
init_means = [images['rgb'][x] for x in np.random.choice(len(images), 4, replace=False)]
cov = np.diag([images['red'].var(), images['green'].var(), images['blue'].var()])
init_covariances = [cov, cov, cov, cov]
init_weights = [1/4., 1/4., 1/4., 1/4.]
# Convert rgb data to numpy arrays
img_data = [np.array(i) for i in images['rgb']]
# Run our EM algorithm on the image data using the above initializations.
# This should converge in about 125 iterations
out = EM(img_data, init_means, init_covariances, init_weights)
ll = out['loglik']
plt.plot(range(len(ll)),ll,linewidth=4)
plt.xlabel('Iteration')
plt.ylabel('Log-likelihood')
plt.rcParams.update({'font.size':16})
plt.tight_layout()
plt.figure()
plt.plot(range(3,len(ll)),ll[3:],linewidth=4)
plt.xlabel('Iteration')
plt.ylabel('Log-likelihood')
plt.rcParams.update({'font.size':16})
plt.tight_layout()
import colorsys
def plot_responsibilities_in_RB(img, resp, title):
N, K = resp.shape
HSV_tuples = [(x*1.0/K, 0.5, 0.9) for x in range(K)]
RGB_tuples = list(map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples))
R = img['red']
B = img['blue']
resp_by_img_int = [[resp[n][k] for k in range(K)] for n in range(N)]
cols = [tuple(np.dot(resp_by_img_int[n], np.array(RGB_tuples))) for n in range(N)]
plt.figure()
for n in range(len(R)):
plt.plot(R[n], B[n], 'o', c=cols[n])
plt.title(title)
plt.xlabel('R value')
plt.ylabel('B value')
plt.rcParams.update({'font.size':16})
plt.tight_layout()
N, K = out['resp'].shape
random_resp = np.random.dirichlet(np.ones(K), N)
plot_responsibilities_in_RB(images, random_resp, 'Random responsibilities')
out = EM(img_data, init_means, init_covariances, init_weights, maxiter=1)
plot_responsibilities_in_RB(images, out['resp'], 'After 1 iteration')
out = EM(img_data, init_means, init_covariances, init_weights, maxiter=20)
plot_responsibilities_in_RB(images, out['resp'], 'After 20 iterations')
loglikelihood([img_data[0]], out['weights'], out['means'], out['covs'])
for k in range(4):
print(out['weights'][k]*multivariate_normal.pdf(img_data[0], out['means'][k], out['covs'][k]))
weights = out['weights']
means = out['means']
covariances = out['covs']
rgb = images['rgb']
N = len(images) # number of images
K = len(means) # number of clusters
assignments = [0]*N
probs = [0]*N
for i in range(N):
# Compute the score of data point i under each Gaussian component:
p = np.zeros(K)
for k in range(K):
p[k] = weights[k]*multivariate_normal.pdf(rgb[i], mean=means[k], cov=covariances[k])
# Compute assignments of each data point to a given cluster based on the above scores:
assignments[i] = np.argmax(p)
# For data point i, store the corresponding score under this cluster assignment:
probs[i] = np.max(p)
assignments = turicreate.SFrame({'assignments':assignments, 'probs':probs, 'image': images['image']})
def get_top_images(assignments, cluster, k=5):
# YOUR CODE HERE
images_in_cluster = assignments[assignments['assignments'] == cluster]
top_images = images_in_cluster.topk('probs', k)
return top_images['image']
# Images will appear in a separate window
for component_id in range(4):
get_top_images(assignments, component_id).explore()
from __future__ import print_function # to conform python 2.x print to python 3.x
import turicreate
from em_utilities import *
wiki = turicreate.SFrame('people_wiki.sframe/').head(5000)
wiki['tf_idf'] = turicreate.text_analytics.tf_idf(wiki['text'])
wiki = wiki.add_row_number()
tf_idf, map_word_to_index = sframe_to_scipy(wiki, 'tf_idf')
map_index_to_word = {index: word for word, index in map_word_to_index.items()}  # invert the word -> index mapping
%%time
tf_idf = normalize(tf_idf)
for i in range(5):
doc = tf_idf[i]
print(np.linalg.norm(doc.todense()))
%%time
from sklearn.cluster import KMeans
np.random.seed(5)
num_clusters = 25
# Use scikit-learn's k-means to simplify workflow
#kmeans_model = KMeans(n_clusters=num_clusters, n_init=5, max_iter=400, random_state=1, n_jobs=-1) # uncomment to use parallelism -- may break on your installation
kmeans_model = KMeans(n_clusters=num_clusters, n_init=5, max_iter=400, random_state=1, n_jobs=1)
kmeans_model.fit(tf_idf)
centroids, cluster_assignment = kmeans_model.cluster_centers_, kmeans_model.labels_
means = [centroid for centroid in centroids]
%%time
num_docs = tf_idf.shape[0]
weights = []
for i in range(num_clusters):
# Compute the number of data points assigned to cluster i:
num_assigned = np.sum(cluster_assignment == i) # YOUR CODE HERE
w = float(num_assigned) / num_docs
weights.append(w)
#cluster_assignment
covs = []
for i in range(num_clusters):
member_rows = tf_idf[cluster_assignment==i]
cov = (member_rows.multiply(member_rows) - 2*member_rows.dot(diag(means[i]))).sum(axis=0).A1 / member_rows.shape[0] \
+ means[i]**2
cov[cov < 1e-8] = 1e-8
covs.append(cov)
out = EM_for_high_dimension(tf_idf, means, covs, weights, cov_smoothing=1e-10)
out['loglik']
# Fill in the blanks
def visualize_EM_clusters(tf_idf, means, covs, map_index_to_word):
print('')
print('==========================================================')
num_clusters = len(means)
for c in range(num_clusters):
print('Cluster {0:d}: Largest mean parameters in cluster '.format(c))
print('\n{0: <12}{1: <12}{2: <12}'.format('Word', 'Mean', 'Variance'))
# The k'th element of sorted_word_ids should be the index of the word
# that has the k'th-largest value in the cluster mean. Hint: Use np.argsort().
sorted_word_ids = np.argsort(-means[c]) # YOUR CODE HERE
for i in sorted_word_ids[:5]:
print('{0: <12}{1:<10.2e}{2:10.2e}'.format(map_index_to_word[i],
means[c][i],
covs[c][i]))
print('\n==========================================================')
'''By EM'''
visualize_EM_clusters(tf_idf, out['means'], out['covs'], map_index_to_word)
np.random.seed(5) # See the note below to see why we set seed=5.
num_clusters = len(means)
num_docs, num_words = tf_idf.shape
random_means = []
random_covs = []
random_weights = []
for k in range(num_clusters):
# Create a numpy array of length num_words with random normally distributed values.
# Use the standard univariate normal distribution (mean 0, variance 1).
# YOUR CODE HERE
mean = np.random.normal(0, 1, num_words)
# Create a numpy array of length num_words with random values uniformly distributed between 1 and 5.
# YOUR CODE HERE
cov = np.random.uniform(1, 5, num_words)
# Initially give each cluster equal weight.
# YOUR CODE HERE
weight = 1 / num_clusters
random_means.append(mean)
random_covs.append(cov)
random_weights.append(weight)
len(random_means), len(random_covs), len(random_weights), num_words, num_docs, num_clusters, random_means[0].shape
out_random_init = EM_for_high_dimension(tf_idf, random_means, random_covs, random_weights, cov_smoothing=1e-5)
out_random_init['loglik']
num_clusters
out = EM_for_high_dimension(tf_idf, means, covs, weights, cov_smoothing=1e-5)
out['loglik']
visualize_EM_clusters(tf_idf, out_random_init['means'], out_random_init['covs'], map_index_to_word)